#pandas module
import pandas as pd


#data visualization module
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo


#set default settings
pd.set_option('mode.chained_assignment',None)
pyo.init_notebook_mode()


data  = pd.read_csv('dataset.csv', engine = 'python')


data.shape

(9994, 9)


data.head()


data.dtypes

Order Date        object
Customer Name     object
State             object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
Quantity           int64
Profit           float64
dtype: object


data['Order Date'] = pd.to_datetime(data['Order Date'])
from_ = data['Order Date'].min()
to_ = data['Order Date'].max()
print('We have the sales information from:', from_, "to", to_)

We have the sales information from: 2014-01-03 00:00:00 to 2017-12-30 00:00:00


data = data.sort_values(by= 'Order Date')
data.head()


pd.DatetimeIndex(data['Order Date']).year

Int64Index([2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014, 2014,
            ...
            2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017, 2017],
           dtype='int64', name='Order Date', length=9994)


data['Year'] = pd.DatetimeIndex(data['Order Date']).year
data['Month'] = pd.DatetimeIndex(data['Order Date']).month
data['Day'] = pd.DatetimeIndex(data['Order Date']).day

data.head()


data_time_yearly_profit = data.groupby(['Year', 'Category']).agg({'Profit':'sum'}).reset_index()


data_time_yearly_profit.head()


px.line(data_time_yearly_profit, x='Year', y='Profit', color= 'Category')


data_time_monthly_profit = data.groupby(['Year', 'Month', 'Category']).agg({'Profit':'sum'}).reset_index()


data_time_monthly_profit['Date'] = data_time_monthly_profit.Year.astype(str)+ '-' + data_time_monthly_profit.Month.astype(str)+ "-01"
data_time_monthly_profit.head()


px.line(data_time_monthly_profit, x='Date', y='Profit', color='Category')


len(data['Customer Name'].unique())

793


customer_data = data.groupby('Year').agg({'Customer Name':'nunique'}).reset_index()
customer_data


px.line(customer_data, x='Year', y='Customer Name')


top10_customers = data.groupby('Customer Name').agg({'Profit':'sum'}).reset_index().sort_values('Profit', ascending=False).head(10)
top10_customers


px.bar(top10_customers, x='Customer Name', y='Profit')


state_data = data.groupby('State').agg({'Profit':'sum'}).reset_index()
state_data


state_codes = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'District of Columbia': 'DC',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY'
}


state_data.State = state_data.State.map(state_codes)


state_data


px.choropleth( state_data,
             locations="State",
             color='Profit',
             locationmode="USA-states",
             scope='usa',
             title='Proft gained in different states',
             color_continuous_scale='Blugrn')


tech_data = data[data.Category == 'Technology'].groupby('State').agg({'Profit':'sum'}).reset_index()

px.choropleth( tech_data,
             locations=tech_data.State.map(state_codes),
             color='Profit',
             locationmode="USA-states",
             scope='usa',
             title='Proft gained in different states selling tech product',
             color_continuous_scale='Pubu')


data


product_data = data.groupby(['Category', 'Sub-Category']).agg({'Profit':'sum'}).reset_index()
product_data = product_data[product_data.Profit >0]
product_data['Sales'] = 'Any'
product_data


import plotly.express as px


fig = px.sunburst(product_data , path = ['Sales','Category','Sub-Category'] , values = 'Profit')

fig.show()


fig = px. treemap(product_data , path = ['Sales','Category','Sub-Category'] , values = 'Profit')

fig.show()


data.head()


data.Quantity.describe()

count    9994.000000
mean        3.789574
std         2.225110
min         1.000000
25%         2.000000
50%         3.000000
75%         5.000000
max        14.000000
Name: Quantity, dtype: float64


px.histogram(data, x='Quantity')


px.box(data, y= 'Quantity', x= 'Category', color='Year')


data.Profit.describe()

count    9994.000000
mean       28.656896
std       234.260108
min     -6599.978000
25%         1.728750
50%         8.666500
75%        29.364000
max      8399.976000
Name: Profit, dtype: float64


px.box(data, y='Profit')


seasons = {
    1 : "Winter",
    2 : "Spring",
    3 : "Summer",
    4 : "Fall"
}


data['Season'] = data.Month.astype(int) % 12 //3+1
data.Season = data.Season.map(seasons)
data.head()


summer_data = data[data.Season == 'Summer']
summer_data.head()


summer_data_agg = summer_data.groupby(['Year', 'Category', 'Season']).agg({'Quantity':'sum'}).reset_index()
summer_data_agg.head()


px.bar(summer_data_agg, x=summer_data_agg.Year.astype('str'),
      y='Quantity',
      color='Category',
      barmode='group')

	Order Date	Customer Name	State	Category	Sub-Category	Product Name	Sales	Quantity	Profit
0	11/8/2016	Claire Gute	Kentucky	Furniture	Bookcases	Bush Somerset Collection Bookcase	261.9600	2	41.9136
1	11/8/2016	Claire Gute	Kentucky	Furniture	Chairs	Hon Deluxe Fabric Upholstered Stacking Chairs,...	731.9400	3	219.5820
2	6/12/2016	Darrin Van Huff	California	Office Supplies	Labels	Self-Adhesive Address Labels for Typewriters b...	14.6200	2	6.8714
3	10/11/2015	Sean O'Donnell	Florida	Furniture	Tables	Bretford CR4500 Series Slim Rectangular Table	957.5775	5	-383.0310
4	10/11/2015	Sean O'Donnell	Florida	Office Supplies	Storage	Eldon Fold 'N Roll Cart System	22.3680	2	2.5164

	Order Date	Customer Name	State	Category	Sub-Category	Product Name	Sales	Quantity	Profit
7980	2014-01-03	Darren Powers	Texas	Office Supplies	Paper	Message Book, Wirebound, Four 5 1/2" X 4" Form...	16.448	2	5.5512
739	2014-01-04	Phillina Ober	Illinois	Office Supplies	Labels	Avery 508	11.784	3	4.2717
740	2014-01-04	Phillina Ober	Illinois	Office Supplies	Storage	SAFCO Boltless Steel Shelving	272.736	3	-64.7748
741	2014-01-04	Phillina Ober	Illinois	Office Supplies	Binders	GBC Standard Plastic Binding Systems Combs	3.540	2	-5.4870
1759	2014-01-05	Mick Brown	Pennsylvania	Office Supplies	Art	Avery Hi-Liter EverBold Pen Style Fluorescent ...	19.536	3	4.8840

	Order Date	Customer Name	State	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Year	Month	Day
7980	2014-01-03	Darren Powers	Texas	Office Supplies	Paper	Message Book, Wirebound, Four 5 1/2" X 4" Form...	16.448	2	5.5512	2014	1	3
739	2014-01-04	Phillina Ober	Illinois	Office Supplies	Labels	Avery 508	11.784	3	4.2717	2014	1	4
740	2014-01-04	Phillina Ober	Illinois	Office Supplies	Storage	SAFCO Boltless Steel Shelving	272.736	3	-64.7748	2014	1	4
741	2014-01-04	Phillina Ober	Illinois	Office Supplies	Binders	GBC Standard Plastic Binding Systems Combs	3.540	2	-5.4870	2014	1	4
1759	2014-01-05	Mick Brown	Pennsylvania	Office Supplies	Art	Avery Hi-Liter EverBold Pen Style Fluorescent ...	19.536	3	4.8840	2014	1	5

	Year	Category	Profit
0	2014	Furniture	5457.7255
1	2014	Office Supplies	22593.4161
2	2014	Technology	21492.8325
3	2015	Furniture	3015.2029
4	2015	Office Supplies	25099.5338

	Year	Month	Category	Profit	Date
0	2014	1	Furniture	805.4665	2014-1-01
1	2014	1	Office Supplies	788.9506	2014-1-01
2	2014	1	Technology	855.7736	2014-1-01
3	2014	2	Furniture	120.6917	2014-2-01
4	2014	2	Office Supplies	176.0910	2014-2-01

Exploratory and Confirmatory Data Analysis¶

Now, In this dataset, we have the following information:¶

Data Exploration: Time information¶

Data Exploration: Customer Aspect¶

Data Exploration: Location Aspect¶

Need to create a choropleth map¶

Data Exploration - Hierarchical Information about the products¶

Data Exploration: Product Sales information (Sales, Quantity, Profit)¶

Distribution Analysis on Quantity column¶

Confirmatory Data Analysis (CDA)¶

Lets make some assumptions about the data, and validate them by using different exploration techniques:¶

Thanks for viewing.¶

	Customer Name	Profit
730	Tamara Chand	8981.3239
622	Raymond Buch	6976.0959
671	Sanjit Chand	5757.4119
334	Hunter Lopez	5622.4292
6	Adrian Barton	5444.8055
757	Tom Ashbrook	4703.7883
157	Christopher Martinez	3899.8904
431	Keith Dawkins	3038.6254
35	Andy Reiter	2884.6208
194	Daniel Raglin	2869.0760

	State	Profit
0	Alabama	5786.8253
1	Arizona	-3427.9246
2	Arkansas	4008.6871
3	California	76381.3871
4	Colorado	-6527.8579
5	Connecticut	3511.4918
6	Delaware	9977.3748
7	District of Columbia	1059.5893
8	Florida	-3399.3017
9	Georgia	16250.0433
10	Idaho	826.7231
11	Illinois	-12607.8870
12	Indiana	18382.9363
13	Iowa	1183.8119
14	Kansas	836.4435
15	Kentucky	11199.6966
16	Louisiana	2196.1023
17	Maine	454.4862
18	Maryland	7031.1788
19	Massachusetts	6785.5016
20	Michigan	24463.1876
21	Minnesota	10823.1874
22	Mississippi	3172.9762
23	Missouri	6436.2105
24	Montana	1833.3285
25	Nebraska	2037.0942
26	Nevada	3316.7659
27	New Hampshire	1706.5028
28	New Jersey	9772.9138
29	New Mexico	1157.1161
30	New York	74038.5486
31	North Carolina	-7490.9122
32	North Dakota	230.1497
33	Ohio	-16971.3766
34	Oklahoma	4853.9560
35	Oregon	-1190.4705
36	Pennsylvania	-15559.9603
37	Rhode Island	7285.6293
38	South Carolina	1769.0566
39	South Dakota	394.8283
40	Tennessee	-5341.6936
41	Texas	-25729.3563
42	Utah	2546.5335
43	Vermont	2244.9783
44	Virginia	18597.9504
45	Washington	33402.6517
46	West Virginia	185.9216
47	Wisconsin	8401.8004
48	Wyoming	100.1960

	Category	Sub-Category	Profit	Sales
1	Furniture	Chairs	26590.1663	Any
2	Furniture	Furnishings	13059.1436	Any
4	Office Supplies	Appliances	18138.0054	Any
5	Office Supplies	Art	6527.7870	Any
6	Office Supplies	Binders	30221.7633	Any
7	Office Supplies	Envelopes	6964.1767	Any
8	Office Supplies	Fasteners	949.5182	Any
9	Office Supplies	Labels	5546.2540	Any
10	Office Supplies	Paper	34053.5693	Any
11	Office Supplies	Storage	21278.8264	Any
13	Technology	Accessories	41936.6357	Any
14	Technology	Copiers	55617.8249	Any
15	Technology	Machines	3384.7569	Any
16	Technology	Phones	44515.7306	Any

	Order Date	Customer Name	State	Category	Sub-Category	Product Name	Sales	Quantity	Profit	Year	Month	Day	Season
245	2014-06-01	Dianna Wilson	Minnesota	Office Supplies	Storage	Safco Steel Mobile File Cart	166.72	2	41.6800	2014	6	1	Summer
307	2014-06-01	Corey Roper	New Jersey	Office Supplies	Art	Boston Heavy-Duty Trimline Electric Pencil Sha...	289.20	6	83.8680	2014	6	1	Summer
246	2014-06-01	Dianna Wilson	Minnesota	Office Supplies	Paper	Adams Telephone Message Book w/Frequently-Call...	47.88	6	23.9400	2014	6	1	Summer
247	2014-06-01	Dianna Wilson	Minnesota	Office Supplies	Appliances	Honeywell Enviracaire Portable HEPA Air Cleane...	1503.25	5	496.0725	2014	6	1	Summer
248	2014-06-01	Dianna Wilson	Minnesota	Office Supplies	Paper	Xerox 205	25.92	4	12.4416	2014	6	1	Summer

	Year	Customer Name
0	2014	595
1	2015	573
2	2016	638
3	2017	693